# ---- Setup: Orange Juice purchase classification (ISLR::OJ) ----
# NOTE(review): this file is a console transcript; pasted console output is
# kept below as '#>' comments so the script parses and runs top to bottom.
#install.packages("magrittr")
#install.packages("dplyr", dependencies=TRUE)
library(reshape2)
library(magrittr)
library(MASS)
### Load Data
data(OJ, package = "ISLR")
# Drop incomplete rows (a safeguard; OJ contains no missing values) and count
data <- na.omit(OJ)
nrow(data)
#> [1] 1070
head(data)
str(data)
#> 'data.frame': 1070 obs. of 18 variables:
#> $ Purchase : Factor w/ 2 levels "CH","MM": 1 1 1 2 1 1 1 1 1 1 ...
#> $ WeekofPurchase: num 237 239 245 227 228 230 232 234 235 238 ...
#> $ StoreID : num 1 1 1 1 7 7 7 7 7 7 ...
#> $ PriceCH : num 1.75 1.75 1.86 1.69 1.69 1.69 1.69 1.75 1.75 1.75 ...
#> $ PriceMM : num 1.99 1.99 2.09 1.69 1.69 1.99 1.99 1.99 1.99 1.99 ...
#> $ DiscCH : num 0 0 0.17 0 0 0 0 0 0 0 ...
#> $ DiscMM : num 0 0.3 0 0 0 0 0.4 0.4 0.4 0.4 ...
#> $ SpecialCH : num 0 0 0 0 0 0 1 1 0 0 ...
#> $ SpecialMM : num 0 1 0 0 0 1 1 0 0 0 ...
#> $ LoyalCH : num 0.5 0.6 0.68 0.4 0.957 ...
#> $ SalePriceMM : num 1.99 1.69 2.09 1.69 1.69 1.99 1.59 1.59 1.59 1.59 ...
#> $ SalePriceCH : num 1.75 1.75 1.69 1.69 1.69 1.69 1.69 1.75 1.75 1.75 ...
#> $ PriceDiff : num 0.24 -0.06 0.4 0 0 0.3 -0.1 -0.16 -0.16 -0.16 ...
#> $ Store7 : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 2 2 2 2 2 ...
#> $ PctDiscMM : num 0 0.151 0 0 0 ...
#> $ PctDiscCH : num 0 0 0.0914 0 0 ...
#> $ ListPriceDiff : num 0.24 0.24 0.23 0 0 0.3 0.3 0.24 0.24 0.24 ...
#> $ STORE : num 1 1 1 1 0 0 0 0 0 0 ...
# Default treatment contrasts: CH is the reference level (MM dummy = 1)
contrasts(data$Purchase)
#>    MM
#> CH  0
#> MM  1
new_data <- data
# One-hot target matrix: column 1 = CH indicator, column 2 = MM indicator.
# Vectorized replacement for the original 1:nrow() for-loop with `=` assigns;
# identical result because na.omit() guarantees Purchase has no NAs.
new_dataY <- cbind(as.numeric(new_data$Purchase == "CH"),
                   as.numeric(new_data$Purchase != "CH"))
head(new_dataY)
#>      [,1] [,2]
#> [1,]    1    0
#> [2,]    1    0
#> [3,]    1    0
#> [4,]    0    1
#> [5,]    1    0
#> [6,]    1    0
head(new_data)
#new_data[,19] <- new_dataY
#head(new_data)
# set CH as 1, 0 otherwise
new_data$Purchase <- as.numeric(new_data$Purchase == "CH")
head(new_data)
# add new columns of categorization target variable
#ndata <- data.frame(data.matrix(cbind(new_data,new_dataY)))
#head(ndata)
# data.matrix() turns the remaining factor column (Store7) into numeric codes
new_data <- data.frame(data.matrix(new_data))
# Reproducible ~70/30 train/test split
set.seed(123)
ind <- sample(c(TRUE, FALSE), nrow(new_data), replace = TRUE, prob = c(0.7, 0.3))
train_data <- new_data[ind, ]
test_data <- new_data[!ind, ]
train_data
# Linear probability model: regress the 0/1 Purchase indicator on everything.
mod.ind <- lm(Purchase ~ ., data = train_data)
#mod.ind2 <- lm(lpurchase ~., data = train_data)
#summary(mod.ind)
mod.ind
#> Call:
#> lm(formula = Purchase ~ ., data = train_data)
#> Coefficients:
#> (Intercept) WeekofPurchase StoreID PriceCH PriceMM DiscCH
#> 0.193112 0.002708 0.041636 -0.915222 0.457200 -3.368190
#> DiscMM SpecialCH SpecialMM LoyalCH SalePriceMM SalePriceCH
#> -5.021630 -0.038520 -0.056174 0.972165 NA NA
#> PriceDiff Store7 PctDiscMM PctDiscCH ListPriceDiff STORE
#> NA -0.160445 9.939498 7.006355 NA NA
# NA coefficients mark predictors dropped for perfect collinearity (e.g.
# SalePriceMM = PriceMM - DiscMM), hence the rank-deficiency warning below.
# The original called predict() twice with identical arguments; once suffices.
predicted <- predict(mod.ind, newdata = test_data)
#> Warning: prediction from a rank-deficient fit may be misleading
head(predicted)
#>         2         4         5         8        11        16
#> 0.5486662 0.3037390 0.9368644 1.0066937 1.0278263 0.7779361
# Classify with a 0.5 threshold on the fitted values
TAB <- table(test_data$Purchase, predicted >= 0.5)
head(TAB)
#>   FALSE TRUE
#> 0    87   25
#> 1    33  167
mklrate <- 1 - sum(diag(TAB)) / sum(TAB)
mklrate
#> [1] 0.1858974
# Misclassification rate with the indicator-variable model is ~18.6%.
head(new_data)
# I have tried transforming categorical and factor variables into matrices with equal distances. Furthermore, I used principal component scores as new X data to resolve collinearity/singularity. #### One train set, one test set
# ---- LDA on the single train/test split ----
mod.lda <- lda(Purchase ~ ., data = train_data)
#> Warning: variables are collinear
mod.lda
#> Call:
#> lda(Purchase ~ ., data = train_data)
#> Prior probabilities of groups:
#>         0         1
#> 0.4023747 0.5976253
#> Group means:
#>   WeekofPurchase StoreID PriceCH PriceMM DiscCH DiscMM SpecialCH SpecialMM LoyalCH
#> 0 252.9443 3.242623 1.873967 2.074328 0.02560656 0.16924590 0.0852459 0.2590164 0.3203285
#> 1 255.5254 4.448124 1.866799 2.099117 0.06532009 0.09523179 0.1810155 0.1214128 0.7324193
#>   SalePriceMM SalePriceCH PriceDiff Store7 PctDiscMM PctDiscCH ListPriceDiff STORE
#> 0 1.905082 1.848361 0.05672131 1.196721 0.08079513 0.01323538 0.2003607 1.865574
#> 1 2.003885 1.801479 0.20240618 1.421634 0.04605959 0.03452180 0.2323179 1.496689
#> Coefficients of linear discriminants:
#>                        LD1
#> WeekofPurchase  0.01103305
#> StoreID         0.06541999
#> PriceCH        -7.02382050
#> PriceMM        -5.71551820
#> DiscCH        -12.90159252
#> DiscMM        -10.41260809
#> SpecialCH      -0.15696440
#> SpecialMM      -0.22889930
#> LoyalCH         3.96141848
#> SalePriceMM     6.00531439
#> SalePriceCH     4.86765720
#> PriceDiff       4.04441675
#> Store7          0.07590531
#> PctDiscMM      40.50186192
#> PctDiscCH      28.54977401
#> ListPriceDiff  -2.47119502
#> STORE           0.10424162
predicted <- predict(mod.lda, newdata = test_data)
TAB <- table(test_data$Purchase, predicted$class)
head(TAB)
#>     0   1
#> 0  87  25
#> 1  33 167
mklrate <- 1 - sum(diag(TAB)) / sum(TAB)
mklrate
#> [1] 0.1858974
train_data
# ---- 5-fold CV estimate of the LDA error via ipred::errorest ----
# Generic wrapper so errorest() obtains hard class labels from predict.lda()
mypred <- function(object, newdata) UseMethod("mypred", object)
mypred.lda <- function(object, newdata) {
  predict(object, newdata = newdata)$class
}
library(ipred)
CEE <- control.errorest(k = 5, nboot = 10)
# Note: CV is run on `data` (Purchase still a factor), not on new_data
ldacvest <- errorest(Purchase ~ ., data = data, model = lda, est.para = CEE, predict = mypred)
#> Warning: variables are collinear (repeated once per fold)
ldacvest
#> Call:
#> errorest.data.frame(formula = Purchase ~ ., data = data, model = lda,
#>     predict = mypred, est.para = CEE)
#> 5-fold cross-validation estimator of misclassification error
#> Misclassification error: 0.1682
# ---- One-hot encode the discrete predictors via reshape2::recast ----
new_data$rowID <- seq_len(nrow(new_data))
head(new_data)
# columns to convert
columns_to_convert <- names(new_data) %in% c("StoreID", "SpecialCH", "SpecialMM", "Store7", "STORE", "rowID")
#columns_to_convert
selected_columns_data <- new_data[columns_to_convert]
# One row per rowID; one 0/1 indicator column per (variable, value) pair
new_data_cc <- recast(selected_columns_data, rowID ~ variable + value, id.var = c("rowID"), fun.aggregate = function(x) (length(x) > 0) + 0L)
head(new_data_cc)
original_columns <- new_data[!columns_to_convert]
new_data_columns_converted <- cbind(original_columns, new_data_cc)
new_data_columns_converted$rowID <- NULL
new_data_columns_converted
# Same seed as the earlier split, so train/test rows match across encodings
set.seed(123)
ind <- sample(c(TRUE, FALSE), nrow(new_data_columns_converted), replace = TRUE, prob = c(0.7, 0.3))
# train data converted columns
train_data_cc <- new_data_columns_converted[ind, ]
# test data converted columns
test_data_cc <- new_data_columns_converted[!ind, ]
train_data_cc
# ---- LDA on the dummy-coded predictors ----
mod.lda2 <- lda(Purchase ~ ., data = train_data_cc)
#> Warning: variables are collinear
mod.lda2
#> Call:
#> lda(Purchase ~ ., data = train_data_cc)
#> Prior probabilities of groups:
#>         0         1
#> 0.4023747 0.5976253
#> Group means:
#>   WeekofPurchase PriceCH PriceMM DiscCH DiscMM LoyalCH SalePriceMM SalePriceCH PriceDiff
#> 0 252.9443 1.873967 2.074328 0.02560656 0.16924590 0.3203285 1.905082 1.848361 0.05672131
#> 1 255.5254 1.866799 2.099117 0.06532009 0.09523179 0.7324193 2.003885 1.801479 0.20240618
#>   PctDiscMM PctDiscCH ListPriceDiff StoreID_1 StoreID_2 StoreID_3 StoreID_4 StoreID_7 SpecialCH_0
#> 0 0.08079513 0.01323538 0.2003607 0.1836066 0.2459016 0.3049180 0.06885246 0.1967213 0.9147541
#> 1 0.04605959 0.03452180 0.2323179 0.1258278 0.1633554 0.1125828 0.17660044 0.4216336 0.8189845
#>   SpecialCH_1 SpecialMM_0 SpecialMM_1 Store7_1 Store7_2 STORE_0 STORE_1 STORE_2 STORE_3
#> 0 0.0852459 0.7409836 0.2590164 0.8032787 0.1967213 0.1967213 0.1836066 0.2459016 0.3049180
#> 1 0.1810155 0.8785872 0.1214128 0.5783664 0.4216336 0.4216336 0.1258278 0.1633554 0.1125828
#>   STORE_4
#> 0 0.06885246
#> 1 0.17660044
#> Coefficients of linear discriminants:
#>                        LD1
#> WeekofPurchase  0.01009241
#> PriceCH        -6.78358856
#> PriceMM        -5.82065943
#> DiscCH        -12.44712324
#> DiscMM        -10.76476287
#> LoyalCH         3.95673884
#> SalePriceMM     6.23281734
#> SalePriceCH     4.69241556
#> PriceDiff       4.29746981
#> PctDiscMM      42.24886370
#> PctDiscCH      26.89856600
#> ListPriceDiff  -2.85129323
#> StoreID_1      -0.14704949
#> StoreID_2      -0.01182117
#> StoreID_3       0.01566897
#> StoreID_4       0.11328117
#> StoreID_7       0.02553317
#> SpecialCH_0     0.07473489
#> SpecialCH_1    -0.07473489
#> SpecialMM_0     0.12394666
#> SpecialMM_1    -0.12394666
#> Store7_1       -0.02553317
#> Store7_2        0.02553317
#> STORE_0         0.02553317
#> STORE_1        -0.14704949
#> STORE_2        -0.01182117
#> STORE_3         0.01566897
#> STORE_4         0.11328117
plot(mod.lda2)
predicted <- predict(mod.lda2, newdata = test_data_cc)
TAB <- table(test_data_cc$Purchase, predicted$class)
head(TAB)
#>     0   1
#> 0  86  26
#> 1  32 168
mklrate <- 1 - sum(diag(TAB)) / sum(TAB)
mklrate
#> [1] 0.1858974
names(train_data_cc[-1])
#>  [1] "WeekofPurchase" "PriceCH" "PriceMM" "DiscCH" "DiscMM"
#>  [6] "LoyalCH" "SalePriceMM" "SalePriceCH" "PriceDiff" "PctDiscMM"
#> [11] "PctDiscCH" "ListPriceDiff" "StoreID_1" "StoreID_2" "StoreID_3"
#> [16] "StoreID_4" "StoreID_7" "SpecialCH_0" "SpecialCH_1" "SpecialMM_0"
#> [21] "SpecialMM_1" "Store7_1" "Store7_2" "STORE_0" "STORE_1"
#> [26] "STORE_2" "STORE_3" "STORE_4"
# Predictor matrices: drop column 1 (Purchase)
X_train <- train_data_cc[-1]
X_test <- test_data_cc[-1]
# PCA on the training predictors. princomp() always centers its input; the
# original's center=TRUE is not a princomp() argument (it belongs to prcomp())
# and was silently swallowed by '...', so it is dropped here.
# NOTE(review): the columns are not scaled, so Comp.1 is dominated by the
# largest-variance column (WeekofPurchase) — consider cor = TRUE if equal
# weighting of predictors is intended.
pca_object <- princomp(X_train)
summary(pca_object)
#>                          Comp.1      Comp.2      Comp.3      Comp.4     Comp.5       Comp.6
#> Standard deviation     15.4412681 1.031326460 0.683781810 0.582903059 0.56523276 0.4886168271
#> Proportion of Variance  0.9881188 0.004407939 0.001937663 0.001408108 0.00132403 0.0009894187
#> Cumulative Proportion   0.9881188 0.992526786 0.994464449 0.995872557 0.99719659 0.9981860061
#>                              Comp.7       Comp.8       Comp.9      Comp.10      Comp.11      Comp.12
#> Standard deviation     0.4529214441 0.3371002340 0.2644087307 0.1670275410 1.291733e-01 6.652160e-02
#> Proportion of Variance 0.0008501372 0.0004709354 0.0002897309 0.0001156164 6.914948e-05 1.833871e-05
#> Cumulative Proportion  0.9990361433 0.9995070787 0.9997968097 0.9999124260 9.999816e-01 9.999999e-01
#>                             Comp.13      Comp.14      Comp.15      Comp.16      Comp.17 Comp.18 Comp.19
#> Standard deviation     3.990375e-03 2.184297e-03 4.335038e-09 3.140639e-09 5.334464e-10       0       0
#> Proportion of Variance 6.598887e-08 1.977273e-08 7.788057e-20 4.087702e-20 1.179301e-21       0       0
#> Cumulative Proportion  1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00       1       1
#>                        Comp.20 Comp.21 Comp.22 Comp.23 Comp.24 Comp.25 Comp.26 Comp.27 Comp.28
#> Standard deviation           0       0       0       0       0       0       0       0       0
#> Proportion of Variance       0       0       0       0       0       0       0       0       0
#> Cumulative Proportion        1       1       1       1       1       1       1       1       1
plot(pca_object)
biplot(pca_object)
# Training-set principal component scores
X_train_scores <- pca_object$scores
head(X_train_scores)
#> (6 rows x 28 score columns; components >= Comp.15 are numerically zero,
#>  values on the order of 1e-15 — pure floating-point noise.)
new_df_train_pc <- data.frame(X_train_scores)
new_df_test <- data.frame(X_test)
#colnames(new_df) = column_names
# Attach the target to the training score frame
new_df_train_pc["Purchase"] <- train_data_cc$Purchase
# Project the held-out rows onto the training PCA basis
test.data_pc <- predict(pca_object, newdata = new_df_test)
test.data_pc <- as.data.frame(test.data_pc)
test.data_pc["Purchase"] <- test_data_cc$Purchase
head(test.data_pc)
head(new_df_train_pc)
# LDA on the first 9 principal components (sidesteps the collinearity warning)
mat.pc.lda <- lda(Purchase ~ Comp.1 + Comp.2 + Comp.3 + Comp.4 + Comp.5 +
                    Comp.6 + Comp.7 + Comp.8 + Comp.9,
                  new_df_train_pc)
plot(mat.pc.lda)
test.data_pc[c("Comp.1", "Comp.2")]
predicted <- predict(mat.pc.lda,
                     newdata = test.data_pc[c("Comp.1", "Comp.2", "Comp.3",
                                              "Comp.4", "Comp.5", "Comp.6",
                                              "Comp.7", "Comp.8", "Comp.9")])
#predicted$class
TAB <- table(test.data_pc$Purchase, predicted$class)
head(TAB)
#>     0   1
#> 0  89  23
#> 1  32 168
mklrate <- 1 - sum(diag(TAB)) / sum(TAB)
mklrate
#> [1] 0.1762821
head(new_df_train_pc)
head(test.data_pc)
# The singularity problem can be solved by using principal components.
# ---- QDA on the first 12 principal components ----
mod.qda <- qda(formula = Purchase ~ Comp.1 + Comp.2 + Comp.3 + Comp.4 +
                 Comp.5 + Comp.6 + Comp.7 + Comp.8 + Comp.9 + Comp.10 +
                 Comp.11 + Comp.12,
               data = new_df_train_pc)
predictqda <- predict(mod.qda, newdata = test.data_pc)
#nrow(test_data)
#predictqda
TAB <- table(test.data_pc$Purchase, predictqda$class)
# Parenthesized consistently with the other rate computations
# (numerically identical to the original 1 - sum(diag(TAB)/sum(TAB))).
mkrqda <- 1 - sum(diag(TAB)) / sum(TAB)
mkrqda
#> [1] 0.1891026
head(train_data_cc)
head(test_data_cc)
# ---- Regularized discriminant analysis ----
# rda() is never loaded by the original script; it comes from the 'klaR'
# package, so load it explicitly for the call to resolve.
library(klaR)
mod.rda <- rda(Purchase ~ ., data = train_data_cc)
#mod.rda$lambda
predictrda <- predict(mod.rda, test_data_cc)
#predictrda
TAB <- table(test_data_cc$Purchase, predictrda$class)
mkrrda <- 1 - sum(diag(TAB)) / sum(TAB)
mkrrda
#> [1] 0.1891026
# RDA on the first 17 principal components
mod.rda <- rda(Purchase ~ Comp.1 + Comp.2 + Comp.3 + Comp.4 + Comp.5 +
                 Comp.6 + Comp.7 + Comp.8 + Comp.9 + Comp.10 + Comp.11 +
                 Comp.12 + Comp.13 + Comp.14 + Comp.15 + Comp.16 + Comp.17,
               data = new_df_train_pc, train.fraction = 0.6)
mod.rda$lambda
#> NULL
predictrda <- predict(mod.rda, test.data_pc)
#predictrda
# Fix: tabulate against the labels of the same PC test frame that was scored
# (the original used test_data_cc$Purchase; the values are identical, but the
# frame should match the one passed to predict()).
TAB <- table(test.data_pc$Purchase, predictrda$class)
mkrrda <- 1 - sum(diag(TAB)) / sum(TAB)
mkrrda
#> [1] 0.1666667
# ---- Sanity check: OLS on all 28 principal components ----
# Components ~15-28 have numerically zero variance, which is why their
# coefficient estimates explode to the e+13..e+15 range below: they are
# dividing by noise. Only the leading components carry signal.
mod <- lm(Purchase ~ ., data = new_df_train_pc)
summary(mod)
#> Call:
#> lm(formula = Purchase ~ ., data = new_df_train_pc)
#> Residuals:
#>      Min       1Q   Median       3Q      Max
#> -1.11740 -0.19875  0.00596  0.21954  0.99304
#> Coefficients:
#>               Estimate Std. Error t value Pr(>|t|)
#> (Intercept)  7.572e-01  7.630e-02   9.923  < 2e-16 ***
#> Comp.1       3.679e-04  1.611e-03   0.228  0.81945
#> Comp.2       2.000e-01  4.594e-01   0.435  0.66349
#> Comp.3       1.890e-01  3.068e-01   0.616  0.53811
#> Comp.4      -1.721e-01  2.344e-01  -0.734  0.46320
#> Comp.5       7.248e-01  3.142e-01   2.306  0.02137 *
#> Comp.6      -5.873e-02  2.497e-01  -0.235  0.81410
#> Comp.7       2.971e-01  6.465e-01   0.460  0.64597
#> Comp.8       7.405e-01  2.766e-01   2.677  0.00759 **
#> Comp.9      -6.039e-01  1.017e+00  -0.594  0.55285
#> Comp.10      2.058e+00  1.221e+00   1.685  0.09239 .
#> Comp.11      4.676e+00  1.705e+00   2.742  0.00626 **
#> Comp.12      1.172e+00  1.463e+01   0.080  0.93622
#> Comp.13     -1.151e+03  1.599e+03  -0.720  0.47157
#> Comp.14      6.463e+03  8.713e+03   0.742  0.45849
#> Comp.15     -1.598e+13  4.511e+14  -0.035  0.97175
#> Comp.16      4.659e+14  5.236e+14   0.890  0.37385
#> Comp.17     -7.220e+14  7.018e+14  -1.029  0.30388
#> Comp.18     -8.021e+14  5.156e+14  -1.556  0.12020
#> Comp.19      6.811e+14  4.034e+14   1.689  0.09174 .
#> Comp.20     -9.114e+14  7.418e+14  -1.229  0.21965
#> Comp.21      8.990e+14  4.395e+14   2.045  0.04118 *
#> Comp.22      1.915e+14  5.597e+14   0.342  0.73239
#> Comp.23      4.025e+14  6.637e+14   0.606  0.54438
#> Comp.24      6.202e+14  5.767e+14   1.076  0.28247
#> Comp.25      4.627e+13  6.108e+14   0.076  0.93964
#> Comp.26     -1.313e+15  6.324e+14  -2.076  0.03828 *
#> Comp.27     -1.328e+15  6.912e+14  -1.921  0.05515 .
#> Comp.28     -5.138e+14  7.692e+14  -0.668  0.50438
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> Residual standard error: 0.3528 on 729 degrees of freedom
#> Multiple R-squared: 0.5023, Adjusted R-squared: 0.4832
#> F-statistic: 26.28 on 28 and 729 DF, p-value: < 2.2e-16
# ---- Part 2: bank marketing data ----
d <- read.csv2("bank.csv")
d
# Binary target: y == "yes" -> 1, otherwise 0
d$y <- as.numeric(d$y == "yes")
d
d$rowID <- seq_len(nrow(d))
#head(d)
# columns to convert
columns_to_convert <- names(d) %in% c("job", "marital", "education", "default", "housing", "loan","contact","poutcome","rowID")
#columns_to_convert
selected_columns_data <- d[columns_to_convert]
# One 0/1 indicator column per (variable, value) pair
new_d_cc <- recast(selected_columns_data, rowID ~ variable + value, id.var = c("rowID"), fun.aggregate = function(x) (length(x) > 0) + 0L)
#> Warning: attributes are not identical across measure variables; they will be dropped
new_d_cc
original_columns <- d[!columns_to_convert]
new_d_cc <- cbind(original_columns, new_d_cc)
new_d_cc$rowID <- NULL
new_d_cc
new_d_cc <- data.frame(data.matrix(new_d_cc))
new_d_cc
# Reproducible 3000-row training sample; the original drew without a seed,
# so its printed results cannot be reproduced exactly.
# (d and new_d_cc have the same row count and order, so indices drawn against
# one validly subset the other.)
set.seed(123)
sample_ind <- sample(nrow(new_d_cc), 3000)
d_train <- d[sample_ind, ]
d_test <- d[-sample_ind, ]
plot(d_train$y)
# LDA on the raw (factor-coded) bank frame.
# NOTE(review): rowID is still a column of d here, so it enters the model as a
# meaningless predictor — see its (tiny) coefficient in the output below;
# it should arguably be dropped before fitting.
mod.lda3 <- lda(y ~ ., data = d[sample_ind, ])
mod.lda3
#> Call:
#> lda(y ~ ., data = d[sample_ind, ])
#> Prior probabilities of groups:
#>         0         1
#> 0.8896667 0.1103333
#> Group means:
#>   age jobblue-collar jobentrepreneur jobhousemaid jobmanagement jobretired jobself-employed
#> 0 41.07981 0.2180592 0.03709254 0.02547771 0.2094417 0.04196328 0.04233795
#> 1 42.39577 0.1389728 0.03625378 0.01812689 0.2326284 0.09667674 0.03625378
#>   jobservices jobstudent jobtechnician jobunemployed jobunknown maritalmarried maritalsingle
#> 0 0.09291870 0.01611090 0.1783439 0.02922443 0.007118771 0.6309479 0.2529037
#> 1 0.06344411 0.03927492 0.1752266 0.03021148 0.012084592 0.5075529 0.3383686
#>   educationsecondary educationtertiary educationunknown defaultyes balance housingyes loanyes
#> 0 0.5133009 0.2922443 0.04083927 0.01760959 1423.451 0.5811165 0.1659798
#> 1 0.4743202 0.3625378 0.03323263 0.01812689 1537.154 0.4561934 0.0815710
#>   contacttelephone contactunknown day monthaug monthdec monthfeb monthjan monthjul
#> 0 0.06444361 0.3195954 16.04646 0.1427501 0.003372049 0.04533533 0.03259648 0.1637317
#> 1 0.07250755 0.1208459 15.60423 0.1510574 0.021148036 0.06948640 0.03625378 0.1238671
#>   monthjun monthmar monthmay monthnov monthoct monthsep duration campaign pdays
#> 0 0.11352567 0.007493443 0.3252154 0.08542525 0.01198951 0.007118771 226.0367 2.878606 36.67141
#> 1 0.09969789 0.045317221 0.1933535 0.07250755 0.05135952 0.033232628 572.7341 2.193353 71.57402
#>   previous poutcomeother poutcomesuccess poutcomeunknown rowID
#> 0 0.4825777 0.04121394 0.01124016 0.8385163 2256.931
#> 1 1.1873112 0.08761329 0.16012085 0.6404834 2310.508
#> Coefficients of linear discriminants:
#>                             LD1
#> age                5.526507e-03
#> jobblue-collar    -2.706623e-01
#> jobentrepreneur   -1.101781e-01
#> jobhousemaid      -4.140922e-01
#> jobmanagement     -1.513013e-01
#> jobretired         1.724787e-01
#> jobself-employed  -1.321104e-01
#> jobservices       -1.716546e-01
#> jobstudent         2.436472e-01
#> jobtechnician     -1.323998e-01
#> jobunemployed     -1.829038e-01
#> jobunknown         2.229728e-01
#> maritalmarried    -2.186959e-01
#> maritalsingle     -3.863537e-02
#> educationsecondary 1.465623e-02
#> educationtertiary  1.404901e-01
#> educationunknown  -1.826217e-01
#> defaultyes         3.462973e-01
#> balance           -3.249330e-06
#> housingyes        -1.046023e-02
#> loanyes           -2.021002e-01
#> contacttelephone   1.099550e-02
#> contactunknown    -5.924556e-01
#> day                9.017458e-03
#> monthaug          -9.305973e-02
#> monthdec           2.389995e-01
#> monthfeb           1.174320e-01
#> monthjan          -5.780757e-01
#> monthjul          -3.597957e-01
#> monthjun           3.453750e-01
#> monthmar           1.672657e+00
#> monthmay          -1.382086e-01
#> monthnov          -4.616188e-01
#> monthoct           7.447575e-01
#> monthsep           7.407785e-01
#> duration           3.370065e-03
#> campaign          -1.714388e-02
#> pdays              4.166779e-04
#> previous           3.380204e-02
#> poutcomeother      5.260135e-01
#> poutcomesuccess    3.088828e+00
#> poutcomeunknown    2.067692e-01
#> rowID              2.232217e-05
#head(TAB3)
predicted3 <- predict(mod.lda3, newdata = d_test)
TAB3 <- table(d_test$y, predicted3$class)
TAB3
#>       0    1
#> 0  1287   44
#> 1   114   76
mklrate <- 1 - sum(diag(TAB3)) / sum(TAB3)
mklrate
#> [1] 0.103879
# I used a 'balanced' sampling strategy, keeping 13% of the 'no' class and 80% of the 'yes' class. This produced a new, balanced training sample of roughly 1000 observations; testing on the held-out sample did not improve the recall.
# ---- Build a class-balanced training sample ----
library(BalancedSampling)
# Select sample
set.seed(123)
N <- 4521                     # population size
n <- 3000                     # sample size
p <- rep(n / N, N)            # inclusion probabilities
X <- cbind(p, runif(N), runif(N))  # matrix of auxiliary variables
s <- cube(p, X)               # select sample
# NOTE(review): 's' is computed but never used below — the balanced split is
# actually built with splitstackshape::stratified(); this cube() demo could
# be removed.
head(s)
#> [1] 1020 2956 3194 866 799 352
# add row id
new_d_cc$id <- seq_len(nrow(new_d_cc))
group_no <- new_d_cc[which(new_d_cc$y == 0), ]
group_yes <- new_d_cc[which(new_d_cc$y == 1), ]
group_no
group_yes
#install.packages("splitstackshape", dependencies = TRUE)
library(splitstackshape)
set.seed(1)
# Keep 13% of the 'no' class and 80% of the 'yes' class
sample.balanced.no <- stratified(group_no, c("y"), .13)
sample.balanced.no
sample.balanced.yes <- stratified(group_yes, c("y"), .8)
sample.balanced.yes
sample.combined <- rbind(sample.balanced.no, sample.balanced.yes)
sample.combined
# Rows not drawn into the balanced training sample form the test set.
# Negative indexing by id works only because id equals the row position
# in new_d_cc — fragile if rows are ever reordered.
commonID <- intersect(sample.combined$id, new_d_cc$id)
test_data_dd <- new_d_cc[-commonID, ]
test_data_dd
plot(sample.combined$y)
sample.combined
test_data_dd$id <- NULL
sample.combined$id <- NULL
nrow(test_data_dd)
#> [1] 3584
# ---- LDA on the balanced training sample ----
mod.lda4 <- lda(y ~ ., data = sample.combined)
#> Warning: variables are collinear
mod.lda4
#> Call:
#> lda(y ~ ., data = sample.combined)
#> Prior probabilities of groups:
#>         0         1
#> 0.5549626 0.4450374
#> Group means:
#>   age balance day month duration campaign pdays previous job_admin. job_blue.collar
#> 0 40.87500 1475.883 15.83846 6.748077 236.6788 2.938462 34.73269 0.4173077 0.1115385 0.2211538
#> 1 42.21343 1605.247 15.88249 6.235012 551.8321 2.225420 69.70743 1.1294964 0.1175060 0.1342926
#>   job_entrepreneur job_housemaid job_management job_retired job_self.employed job_services job_student
#> 0 0.03653846 0.02307692 0.1884615 0.04615385 0.03846154 0.10961538 0.01538462
#> 1 0.02637890 0.02158273 0.2565947 0.10311751 0.02877698 0.07434053 0.04076739
#>   job_technician job_unemployed job_unknown marital_divorced marital_married marital_single
#> 0 0.1750000 0.03076923 0.003846154 0.1288462 0.6134615 0.2576923
#> 1 0.1606715 0.02158273 0.014388489 0.1462830 0.5275779 0.3261391
#>   education_primary education_secondary education_tertiary education_unknown default_no default_yes
#> 0 0.1634615 0.5173077 0.2653846 0.05384615 0.9769231 0.02307692
#> 1 0.1175060 0.4868106 0.3573141 0.03836930 0.9832134 0.01678657
#>   housing_no housing_yes loan_no loan_yes contact_cellular contact_telephone contact_unknown
#> 0 0.4500000 0.5500000 0.8500000 0.15000000 0.6173077 0.04230769 0.3403846
#> 1 0.5779376 0.4220624 0.9232614 0.07673861 0.8033573 0.08393285 0.1127098
#>   poutcome_failure poutcome_other poutcome_success poutcome_unknown
#> 0 0.09807692 0.04038462 0.005769231 0.8557692
#> 1 0.13429257 0.07434053 0.158273381 0.6330935
#> Coefficients of linear discriminants:
#>                              LD1
#> age                -3.600979e-03
#> balance             1.638638e-05
#> day                 8.219898e-04
#> month              -5.822340e-03
#> duration            2.626851e-03
#> campaign           -5.699570e-02
#> pdays              -5.055202e-04
#> previous            4.032731e-02
#> job_admin.          9.864568e-02
#> job_blue.collar    -1.750189e-01
#> job_entrepreneur   -2.127936e-01
#> job_housemaid      -2.202679e-01
#> job_management      1.643140e-01
#> job_retired         6.500307e-01
#> job_self.employed  -8.875918e-02
#> job_services       -4.159753e-01
#> job_student         6.414710e-01
#> job_technician     -5.230941e-02
#> job_unemployed     -8.520246e-01
#> job_unknown         9.209075e-01
#> marital_divorced    1.198250e-01
#> marital_married    -9.593275e-02
#> marital_single      4.481963e-02
#> education_primary  -1.215675e-01
#> education_secondary -1.578210e-03
#> education_tertiary  1.497721e-01
#> education_unknown  -3.642761e-01
#> default_no          3.426875e-02
#> default_yes        -3.426875e-02
#> housing_no          6.846746e-02
#> housing_yes        -6.846746e-02
#> loan_no             2.680189e-01
#> loan_yes           -2.680189e-01
#> contact_cellular    2.447616e-01
#> contact_telephone   4.014668e-01
#> contact_unknown    -4.261276e-01
#> poutcome_failure   -1.810792e-01
#> poutcome_other      8.899898e-02
#> poutcome_success    1.592406e+00
#> poutcome_unknown   -4.995688e-01
predicted4 <- predict(mod.lda4, newdata = test_data_dd)
TAB4 <- table(test_data_dd$y, predicted4$class)
TAB4
#>       0    1
#> 0  3063  417
#> 1    40   64
# Result with no balanced sampling, for comparison:
#>       0    1
#> 0  1298   44
#> 1    98   81
mklrate <- 1 - sum(diag(TAB4)) / sum(TAB4)
mklrate
#> [1] 0.1275112